{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import dendropy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ebola_raxml = dendropy.Tree.get_from_path('my_ebola.nex', 'nexus')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'BDBV_KC545395': 2 2\n",
      "'BDBV_KC545396': 3 3\n",
      "'EBOV_2007_KC242788': 11 11\n",
      "'EBOV_2007_KC242787': 11 11\n",
      "'EBOV_2007_KC242786': 12 12\n",
      "'EBOV_2007_KC242789': 12 12\n",
      "'EBOV_2007_KC242784': 11 11\n",
      "'EBOV_2007_KC242785': 10 10\n",
      "'EBOV_2007_KC242790': 10 10\n",
      "'EBOV_1995_KC242799': 10 10\n",
      "'EBOV_1995_KC242796': 10 10\n",
      "'EBOV_1976_KC242801': 10 10\n",
      "'EBOV_1976_AF272001': 10 10\n",
      "'EBOV_2014_KM034549': 9 9\n",
      "'EBOV_2014_KM034550': 10 10\n",
      "'EBOV_2014_KM034555': 11 11\n",
      "'EBOV_2014_KM034560': 16 16\n",
      "'EBOV_2014_KM034553': 18 18\n",
      "'EBOV_2014_KM034552': 18 18\n",
      "'EBOV_2014_KM034556': 18 18\n",
      "'EBOV_2014_KM034557': 18 18\n",
      "'EBOV_2014_KM034551': 16 16\n",
      "'EBOV_2014_KM034558': 16 16\n",
      "'EBOV_2014_KM034562': 14 14\n",
      "'EBOV_2014_KM034554': 13 13\n",
      "'EBOV_2014_KM034559': 13 13\n",
      "'EBOV_2014_KM034561': 13 13\n",
      "'EBOV_2014_KM034563': 8 8\n",
      "'SUDV_EU338380': 9 9\n",
      "'SUDV_KC242783': 10 10\n",
      "'SUDV_FJ968794': 10 10\n",
      "'SUDV_KC589025': 9 9\n",
      "'SUDV_AY729654': 10 10\n",
      "'SUDV_JN638998': 10 10\n",
      "'RESTV_FJ621584': 8 8\n",
      "'RESTV_FJ621585': 10 10\n",
      "'RESTV_JX477166': 11 11\n",
      "'RESTV_AB050936': 11 11\n",
      "'RESTV_JX477165': 10 10\n",
      "'RESTV_FJ621583': 10 10\n",
      "'TAFV_FJ217162': 5 5\n",
      "'BDBV_FJ217161': 4 4\n",
      "'BDBV_KC545394': 1 1\n",
      "'BDBV_KC545393': 1 1\n"
     ]
    }
   ],
   "source": [
    "def compute_level(node, level=0):\n",
    "    for child in node.child_nodes():\n",
    "        compute_level(child, level + 1)\n",
    "    if node.taxon is not None:\n",
    "        print(\"%s: %d %d\" % (node.taxon, node.level(), level))\n",
    "\n",
    "compute_level(ebola_raxml.seed_node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'BDBV_KC545395': 0 2\n",
      "'BDBV_KC545396': 0 3\n",
      "'EBOV_2007_KC242788': 0 11\n",
      "'EBOV_2007_KC242787': 0 11\n",
      "Internal: 1 10\n",
      "'EBOV_2007_KC242786': 0 12\n",
      "'EBOV_2007_KC242789': 0 12\n",
      "Internal: 1 11\n",
      "'EBOV_2007_KC242784': 0 11\n",
      "Internal: 2 10\n",
      "Internal: 3 9\n",
      "'EBOV_2007_KC242785': 0 10\n",
      "'EBOV_2007_KC242790': 0 10\n",
      "Internal: 1 9\n",
      "Internal: 4 8\n",
      "'EBOV_1995_KC242799': 0 10\n",
      "'EBOV_1995_KC242796': 0 10\n",
      "Internal: 1 9\n",
      "'EBOV_1976_KC242801': 0 10\n",
      "'EBOV_1976_AF272001': 0 10\n",
      "Internal: 1 9\n",
      "Internal: 2 8\n",
      "Internal: 5 7\n",
      "'EBOV_2014_KM034549': 0 9\n",
      "'EBOV_2014_KM034550': 0 10\n",
      "'EBOV_2014_KM034555': 0 11\n",
      "'EBOV_2014_KM034560': 0 16\n",
      "'EBOV_2014_KM034553': 0 18\n",
      "'EBOV_2014_KM034552': 0 18\n",
      "Internal: 1 17\n",
      "'EBOV_2014_KM034556': 0 18\n",
      "'EBOV_2014_KM034557': 0 18\n",
      "Internal: 1 17\n",
      "Internal: 2 16\n",
      "Internal: 3 15\n",
      "'EBOV_2014_KM034551': 0 16\n",
      "'EBOV_2014_KM034558': 0 16\n",
      "Internal: 1 15\n",
      "Internal: 4 14\n",
      "'EBOV_2014_KM034562': 0 14\n",
      "Internal: 5 13\n",
      "'EBOV_2014_KM034554': 0 13\n",
      "Internal: 6 12\n",
      "'EBOV_2014_KM034559': 0 13\n",
      "'EBOV_2014_KM034561': 0 13\n",
      "Internal: 1 12\n",
      "Internal: 7 11\n",
      "Internal: 8 10\n",
      "Internal: 9 9\n",
      "Internal: 10 8\n",
      "'EBOV_2014_KM034563': 0 8\n",
      "Internal: 11 7\n",
      "Internal: 12 6\n",
      "'SUDV_EU338380': 0 9\n",
      "'SUDV_KC242783': 0 10\n",
      "'SUDV_FJ968794': 0 10\n",
      "Internal: 1 9\n",
      "Internal: 2 8\n",
      "'SUDV_KC589025': 0 9\n",
      "'SUDV_AY729654': 0 10\n",
      "'SUDV_JN638998': 0 10\n",
      "Internal: 1 9\n",
      "Internal: 2 8\n",
      "Internal: 3 7\n",
      "'RESTV_FJ621584': 0 8\n",
      "'RESTV_FJ621585': 0 10\n",
      "'RESTV_JX477166': 0 11\n",
      "'RESTV_AB050936': 0 11\n",
      "Internal: 1 10\n",
      "Internal: 2 9\n",
      "'RESTV_JX477165': 0 10\n",
      "'RESTV_FJ621583': 0 10\n",
      "Internal: 1 9\n",
      "Internal: 3 8\n",
      "Internal: 4 7\n",
      "Internal: 5 6\n",
      "Internal: 13 5\n",
      "'TAFV_FJ217162': 0 5\n",
      "Internal: 14 4\n",
      "'BDBV_FJ217161': 0 4\n",
      "Internal: 15 3\n",
      "Internal: 16 2\n",
      "Internal: 17 1\n",
      "'BDBV_KC545394': 0 1\n",
      "'BDBV_KC545393': 0 1\n",
      "Internal: 18 0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "18"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def compute_height(node):\n",
    "    children = node.child_nodes()\n",
    "    if len(children) == 0:\n",
    "        height = 0\n",
    "    else:\n",
    "        height = 1 + max(map(lambda x: compute_height(x), children))\n",
    "    desc = node.taxon or 'Internal'\n",
    "    print(\"%s: %d %d\" % (desc, height, node.level()))\n",
    "    return height\n",
    "\n",
    "compute_height(ebola_raxml.seed_node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Internal: 3 0\n"
     ]
    }
   ],
   "source": [
    "def compute_nofs(node):\n",
    "    children = node.child_nodes()\n",
    "    nofs = len(children)\n",
    "    map(lambda x: compute_nofs(x), children)\n",
    "    desc = node.taxon or 'Internal'\n",
    "    print(\"%s: %d %d\" % (desc, nofs, node.level()))\n",
    "\n",
    "compute_nofs(ebola_raxml.seed_node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'BDBV_KC545395' (2)\n",
      "'BDBV_KC545396' (3)\n",
      "'EBOV_2007_KC242788' (11)\n",
      "'EBOV_2007_KC242787' (11)\n",
      "'EBOV_2007_KC242786' (12)\n",
      "'EBOV_2007_KC242789' (12)\n",
      "'EBOV_2007_KC242784' (11)\n",
      "'EBOV_2007_KC242785' (10)\n",
      "'EBOV_2007_KC242790' (10)\n",
      "'EBOV_1995_KC242799' (10)\n",
      "'EBOV_1995_KC242796' (10)\n",
      "'EBOV_1976_KC242801' (10)\n",
      "'EBOV_1976_AF272001' (10)\n",
      "'EBOV_2014_KM034549' (9)\n",
      "'EBOV_2014_KM034550' (10)\n",
      "'EBOV_2014_KM034555' (11)\n",
      "'EBOV_2014_KM034560' (16)\n",
      "'EBOV_2014_KM034553' (18)\n",
      "'EBOV_2014_KM034552' (18)\n",
      "'EBOV_2014_KM034556' (18)\n",
      "'EBOV_2014_KM034557' (18)\n",
      "'EBOV_2014_KM034551' (16)\n",
      "'EBOV_2014_KM034558' (16)\n",
      "'EBOV_2014_KM034562' (14)\n",
      "'EBOV_2014_KM034554' (13)\n",
      "'EBOV_2014_KM034559' (13)\n",
      "'EBOV_2014_KM034561' (13)\n",
      "'EBOV_2014_KM034563' (8)\n",
      "'SUDV_EU338380' (9)\n",
      "'SUDV_KC242783' (10)\n",
      "'SUDV_FJ968794' (10)\n",
      "'SUDV_KC589025' (9)\n",
      "'SUDV_AY729654' (10)\n",
      "'SUDV_JN638998' (10)\n",
      "'RESTV_FJ621584' (8)\n",
      "'RESTV_FJ621585' (10)\n",
      "'RESTV_JX477166' (11)\n",
      "'RESTV_AB050936' (11)\n",
      "'RESTV_JX477165' (10)\n",
      "'RESTV_FJ621583' (10)\n",
      "'TAFV_FJ217162' (5)\n",
      "'BDBV_FJ217161' (4)\n",
      "'BDBV_KC545394' (1)\n",
      "'BDBV_KC545393' (1)\n"
     ]
    }
   ],
   "source": [
    "def print_nodes(node):\n",
    "    for child in node.child_nodes():\n",
    "        print_nodes(child)\n",
    "    if node.taxon is not None:\n",
    "        print('%s (%d)' % (node.taxon, node.level()))\n",
    "\n",
    "print_nodes(ebola_raxml.seed_node)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'BDBV_KC545394' (1)\n",
      "'BDBV_KC545393' (1)\n",
      "'BDBV_KC545395' (2)\n",
      "'BDBV_KC545396' (3)\n",
      "'BDBV_FJ217161' (4)\n",
      "'TAFV_FJ217162' (5)\n",
      "'EBOV_2014_KM034563' (8)\n",
      "'RESTV_FJ621584' (8)\n",
      "'EBOV_2014_KM034549' (9)\n",
      "'SUDV_EU338380' (9)\n",
      "'SUDV_KC589025' (9)\n",
      "'EBOV_2007_KC242785' (10)\n",
      "'EBOV_2007_KC242790' (10)\n",
      "'EBOV_1995_KC242799' (10)\n",
      "'EBOV_1995_KC242796' (10)\n",
      "'EBOV_1976_KC242801' (10)\n",
      "'EBOV_1976_AF272001' (10)\n",
      "'EBOV_2014_KM034550' (10)\n",
      "'SUDV_KC242783' (10)\n",
      "'SUDV_FJ968794' (10)\n",
      "'SUDV_AY729654' (10)\n",
      "'SUDV_JN638998' (10)\n",
      "'RESTV_FJ621585' (10)\n",
      "'RESTV_JX477165' (10)\n",
      "'RESTV_FJ621583' (10)\n",
      "'EBOV_2007_KC242788' (11)\n",
      "'EBOV_2007_KC242787' (11)\n",
      "'EBOV_2007_KC242784' (11)\n",
      "'EBOV_2014_KM034555' (11)\n",
      "'RESTV_JX477166' (11)\n",
      "'RESTV_AB050936' (11)\n",
      "'EBOV_2007_KC242786' (12)\n",
      "'EBOV_2007_KC242789' (12)\n",
      "'EBOV_2014_KM034554' (13)\n",
      "'EBOV_2014_KM034559' (13)\n",
      "'EBOV_2014_KM034561' (13)\n",
      "'EBOV_2014_KM034562' (14)\n",
      "'EBOV_2014_KM034560' (16)\n",
      "'EBOV_2014_KM034551' (16)\n",
      "'EBOV_2014_KM034558' (16)\n",
      "'EBOV_2014_KM034553' (18)\n",
      "'EBOV_2014_KM034552' (18)\n",
      "'EBOV_2014_KM034556' (18)\n",
      "'EBOV_2014_KM034557' (18)\n"
     ]
    }
   ],
   "source": [
    "from collections import deque\n",
    "\n",
    "def print_breadth(tree):\n",
    "    queue = deque()\n",
    "    queue.append(tree.seed_node)\n",
    "    while len(queue) > 0:\n",
    "        process_node = queue.popleft()\n",
    "        if process_node.taxon is not None:\n",
    "            print('%s (%d)' % (process_node.taxon, process_node.level()))\n",
    "        else:\n",
    "            for child in process_node.child_nodes():\n",
    "                queue.append(child)\n",
    "\n",
    "print_breadth(ebola_raxml)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'BDBV'} 1\n",
      "{'BDBV'} 1\n",
      "{'EBOV2007'} 7\n",
      "{'EBOV1995'} 2\n",
      "{'EBOV1976'} 2\n",
      "{'EBOV2014'} 15\n",
      "{'SUDV'} 6\n",
      "{'RESTV'} 6\n",
      "{'TAFV'} 1\n",
      "{'BDBV'} 1\n",
      "{'BDBV'} 1\n",
      "{'BDBV'} 1\n"
     ]
    }
   ],
   "source": [
    "from copy import deepcopy\n",
    "simple_ebola = deepcopy(ebola_raxml)\n",
    "\n",
    "def simplify_tree(node):\n",
    "    prefs = set()\n",
    "    for leaf in node.leaf_nodes():\n",
    "        my_toks = leaf.taxon.label.split(' ')[0].split('_')\n",
    "        if my_toks[0] == 'EBOV':\n",
    "            prefs.add('EBOV' + my_toks[1])\n",
    "        else:\n",
    "            prefs.add(my_toks[0])\n",
    "    if len(prefs) == 1:\n",
    "        print(prefs, len(node.leaf_nodes()))\n",
    "        node.taxon = dendropy.Taxon(label=list(prefs)[0])\n",
    "        #node.collapse_clade()\n",
    "        node.set_child_nodes([])\n",
    "    else:\n",
    "        for child in node.child_nodes():\n",
    "            simplify_tree(child)\n",
    "\n",
    "simplify_tree(simple_ebola.seed_node)\n",
    "simple_ebola.ladderize()\n",
    "simple_ebola.write_to_path('ebola_simple.nex', 'nexus')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
